{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dd08c6f4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b84f8e2bb56a4d51935e3076f393e049",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'/home/mesolitica/stt/Classification-Speech-Instructions'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "# Fetch only the parquet shards of the source dataset repo into a local folder.\n",
    "# The call's return value is left as the last expression so the cell displays it.\n",
    "download_options = dict(\n",
    "    repo_id='mesolitica/Classification-Speech-Instructions',\n",
    "    repo_type='dataset',\n",
    "    allow_patterns='data/*.parquet',\n",
    "    local_dir='./Classification-Speech-Instructions',\n",
    ")\n",
    "snapshot_download(**download_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "6bd46a01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Classification-Speech-Instructions/data/emotion-00001-of-00003.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion-00002-of-00003.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion-00000-of-00003.parquet']"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "# glob() returns matches in arbitrary, filesystem-dependent order; sort so the\n",
    "# shard order (and therefore the order of the rows built from it) is stable.\n",
    "files = sorted(glob('Classification-Speech-Instructions/data/emotion-*.parquet'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "bdaec324",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 3894/3894 [00:00<00:00, 21610.03it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 3893/3893 [00:00<00:00, 21301.27it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 3894/3894 [00:00<00:00, 21660.29it/s]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Build one record per parquet row whose extracted audio clip exists on disk.\n",
    "emotions = []\n",
    "for parquet_path in files:\n",
    "    shard_name = os.path.basename(parquet_path).replace('.parquet', '')\n",
    "    shard = pd.read_parquet(parquet_path)\n",
    "    for row_idx in tqdm(range(len(shard))):\n",
    "        # The 'metadata' column is a JSON string; its 'emotion' field becomes the answer.\n",
    "        row_meta = json.loads(shard['metadata'].iloc[row_idx])\n",
    "        audio_path = os.path.join(\n",
    "            'Classification-Speech-Instructions-audio', f'{shard_name}-{row_idx}.mp3'\n",
    "        )\n",
    "        if os.path.exists(audio_path):\n",
    "            # Keep the whole source row (minus its original audio path) as JSON.\n",
    "            source_row = shard.iloc[row_idx].to_dict()\n",
    "            source_row.pop('audio_filename')\n",
    "            emotions.append({\n",
    "                'audio_filename': audio_path,\n",
    "                'metadata': json.dumps(source_row),\n",
    "                'answer': row_meta['emotion'],\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "e32a92ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['disgust', 'neutral', 'happy', 'sadness', 'anger', 'calm', 'fear', 'surprise']\""
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sort the labels: iterating a set of strings has no stable order across kernel\n",
    "# sessions, which would make the label list embedded in every question irreproducible.\n",
    "unique_emotions = str(sorted({r['answer'] for r in emotions}))\n",
    "unique_emotions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "d04d2fe2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Seed the global RNG so the template chosen for each record is the same on\n",
    "# every run of this cell instead of varying between executions.\n",
    "random.seed(42)\n",
    "\n",
    "# Prompt templates; `{labels}` is replaced with the stringified label list.\n",
    "questions = [\n",
    "    'given the labels\\n{labels}\\n\\nwhat is the label for the audio',\n",
    "    'what is the label for audio\\n\\nthe labels: {labels}'\n",
    "]\n",
    "\n",
    "# Attach a randomly chosen, filled-in template to every record (mutates in place).\n",
    "for record in emotions:\n",
    "    record['question'] = random.choice(questions).format(labels=unique_emotions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "8bd99c08",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'audio_filename': 'Classification-Speech-Instructions-audio/emotion-00001-of-00003-0.mp3',\n",
       " 'metadata': '{\"question\": \"Can you tell what the emotion is in this recording?\", \"answer\": \"The speaker seems to express a sense of disgust, indicating a strong negative reaction to what has happened. The intensity of this emotion is not clear, but it\\'s evident that they are deeply bothered by the situation.\", \"metadata\": \"{\\\\\"filename\\\\\": \\\\\"ssi-speech-emotion-recognition-audio/ssi-speech-emotion-recognition_data_validation-00000-of-00001_706.wav\\\\\", \\\\\"text\\\\\": \\\\\"That Is Exactly what happened\\\\\", \\\\\"emotion\\\\\": \\\\\"disgust\\\\\", \\\\\"intensity\\\\\": \\\\\"unknown\\\\\"}\", \"source\": \"stapesai/ssi-speech-emotion-recognition\"}',\n",
       " 'answer': 'disgust',\n",
       " 'question': \"given the labels\\n['disgust', 'neutral', 'happy', 'sadness', 'anger', 'calm', 'fear', 'surprise']\\n\\nwhat is the label for the audio\"}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the first record: audio path, JSON metadata, answer and generated question.\n",
    "emotions[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "aa3cb8da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Classification-Speech-Instructions/data/emotion_part2-00004-of-00007.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion_part2-00001-of-00007.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion_part2-00003-of-00007.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion_part2-00000-of-00007.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion_part2-00005-of-00007.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion_part2-00002-of-00007.parquet',\n",
       " 'Classification-Speech-Instructions/data/emotion_part2-00006-of-00007.parquet']"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# glob() returns matches in arbitrary, filesystem-dependent order; sort so the\n",
    "# part2 shards are always processed in the same order.\n",
    "files = sorted(glob('Classification-Speech-Instructions/data/emotion_part2-*.parquet'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "b4287647",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████| 985/985 [00:00<00:00, 20878.89it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 985/985 [00:00<00:00, 20527.93it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 985/985 [00:00<00:00, 21157.86it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 985/985 [00:00<00:00, 21259.22it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 984/984 [00:00<00:00, 20330.01it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 985/985 [00:00<00:00, 21880.51it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████| 984/984 [00:00<00:00, 21349.81it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one record per part2 parquet row whose extracted audio clip exists on disk.\n",
    "emotions_part2 = []\n",
    "for parquet_path in files:\n",
    "    shard_name = os.path.basename(parquet_path).replace('.parquet', '')\n",
    "    shard = pd.read_parquet(parquet_path)\n",
    "    for row_idx in tqdm(range(len(shard))):\n",
    "        # The 'metadata' column is a JSON string; its 'emotion' field becomes the answer.\n",
    "        row_meta = json.loads(shard['metadata'].iloc[row_idx])\n",
    "        audio_path = os.path.join(\n",
    "            'Classification-Speech-Instructions-audio', f'{shard_name}-{row_idx}.mp3'\n",
    "        )\n",
    "        if os.path.exists(audio_path):\n",
    "            # Keep the whole source row (minus its original audio path) as JSON.\n",
    "            source_row = shard.iloc[row_idx].to_dict()\n",
    "            source_row.pop('audio_filename')\n",
    "            emotions_part2.append({\n",
    "                'audio_filename': audio_path,\n",
    "                'metadata': json.dumps(source_row),\n",
    "                'answer': row_meta['emotion'],\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "761ba010",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['Disgusted', 'Angry', 'Neutral', 'Amused', 'Sleepy']\""
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sort the labels: iterating a set of strings has no stable order across kernel\n",
    "# sessions, which would make the label list embedded in every question irreproducible.\n",
    "unique_emotions = str(sorted({r['answer'] for r in emotions_part2}))\n",
    "unique_emotions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "e557664c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach a randomly chosen prompt template, filled with the current\n",
    "# `unique_emotions` label string, to every part2 record (mutates in place).\n",
    "for record in emotions_part2:\n",
    "    record['question'] = random.choice(questions).format(labels=unique_emotions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "832ec0f0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'audio_filename': 'Classification-Speech-Instructions-audio/emotion_part2-00004-of-00007-0.mp3',\n",
       " 'metadata': '{\"question\": \"Can you identify the emotional tone of this sound?\", \"answer\": \"The audio conveys a neutral emotion, reflecting a calm and matter-of-fact tone. It doesn\\'t carry any strong feelings or intense reactions, suggesting a straightforward and composed delivery.\", \"metadata\": \"{\\\\\"filename\\\\\": \\\\\"EmoV_DB_audio/EmoV_DB_data_train_train-home_knoriy_fsx_processed_datasets_EmoV_db_EmoV_DB_tars_train_1224-home_knoriy_fsx_processed_datasets_EmoV_db_EmoV_DB_tars_train_3480-batch143_15.wav\\\\\", \\\\\"text\\\\\": \\\\\"Next to them the Canada jays were most persistent.\\\\\", \\\\\"emotion\\\\\": \\\\\"Neutral\\\\\"}\", \"source\": \"CLAPv2/EmoV_DB\"}',\n",
       " 'answer': 'Neutral',\n",
       " 'question': \"what is the label for audio\\n\\nthe labels: ['Disgusted', 'Angry', 'Neutral', 'Amused', 'Sleepy']\"}"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the first part2 record: audio path, JSON metadata, answer and question.\n",
    "emotions_part2[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "fae017dd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Classification-Speech-Instructions/data/emotion_part3-00000-of-00001.parquet']"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# glob() returns matches in arbitrary, filesystem-dependent order; sort so the\n",
    "# part3 shards are always processed in the same order.\n",
    "files = sorted(glob('Classification-Speech-Instructions/data/emotion_part3-*.parquet'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "e0f5b513",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<00:00, 20474.07it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one record per part3 parquet row whose extracted audio clip exists on disk.\n",
    "emotions_part3 = []\n",
    "for parquet_path in files:\n",
    "    shard_name = os.path.basename(parquet_path).replace('.parquet', '')\n",
    "    shard = pd.read_parquet(parquet_path)\n",
    "    for row_idx in tqdm(range(len(shard))):\n",
    "        # The 'metadata' column is a JSON string; its 'emotion' field becomes the answer.\n",
    "        row_meta = json.loads(shard['metadata'].iloc[row_idx])\n",
    "        audio_path = os.path.join(\n",
    "            'Classification-Speech-Instructions-audio', f'{shard_name}-{row_idx}.mp3'\n",
    "        )\n",
    "        if os.path.exists(audio_path):\n",
    "            # Keep the whole source row (minus its original audio path) as JSON.\n",
    "            source_row = shard.iloc[row_idx].to_dict()\n",
    "            source_row.pop('audio_filename')\n",
    "            emotions_part3.append({\n",
    "                'audio_filename': audio_path,\n",
    "                'metadata': json.dumps(source_row),\n",
    "                'answer': row_meta['emotion'],\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "825d29d4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['neutral', 'happy', 'angry', 'disgusted', 'surprised', 'sad', 'fearful']\""
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sort the labels: iterating a set of strings has no stable order across kernel\n",
    "# sessions, which would make the label list embedded in every question irreproducible.\n",
    "unique_emotions = str(sorted({r['answer'] for r in emotions_part3}))\n",
    "unique_emotions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "db309e91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach a randomly chosen prompt template, filled with the current\n",
    "# `unique_emotions` label string, to every part3 record (mutates in place).\n",
    "for record in emotions_part3:\n",
    "    record['question'] = random.choice(questions).format(labels=unique_emotions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "0cbc2685",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11681, 6893, 1400)"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Record counts for the three emotion record lists, in order.\n",
    "tuple(len(part) for part in (emotions, emotions_part2, emotions_part3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "cfcf917f",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.7G\tClassification-Speech-Instructions-audio\r\n"
     ]
    }
   ],
   "source": [
    "# Human-readable total on-disk size of the extracted audio directory.\n",
    "!du -hs Classification-Speech-Instructions-audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "7f093b16",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Concatenate the three record lists, in order, into a single dataset.\n",
    "all_records = [*emotions, *emotions_part2, *emotions_part3]\n",
    "dataset = Dataset.from_list(all_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "8da34f1c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e156c8dde8f4138b0bb1bf5ed2c24ff",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "375e6b6c325945a6a6984086eec1ced5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "21309b84970b4ff38169416b58e968c0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/3.69M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/cdbe3f363f617f73a5bb96c19db1bb010e716a32', commit_message='Upload dataset', commit_description='', oid='cdbe3f363f617f73a5bb96c19db1bb010e716a32', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Publish the combined records as the 'emotion' split of the Hub dataset repo.\n",
    "dataset.push_to_hub(\n",
    "    'mesolitica/Zeroshot-Audio-Classification-Instructions',\n",
    "    split='emotion',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "eedb77c9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "117508"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "# Collect every mp3 in the audio directory. glob() order is filesystem-dependent,\n",
    "# so sort to give anything built from this list a deterministic order.\n",
    "audio_files = sorted(glob('Classification-Speech-Instructions-audio/*.mp3'))\n",
    "len(audio_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "642d0c93",
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "\n",
    "# Pack every collected mp3 into one archive, stored under the same relative path.\n",
    "archive_name = 'Classification-Speech-Instructions-audio.zip'\n",
    "with zipfile.ZipFile(archive_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:\n",
    "    for mp3_path in audio_files:\n",
    "        archive.write(mp3_path, arcname=mp3_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "532cddb3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|███████████████████████▉| 7.52G/7.52G [01:45<00:00, 71.5MB/s]\n",
      "https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/Classification-Speech-Instructions-audio.zip\n"
     ]
    }
   ],
   "source": [
    "# Upload the audio archive to the dataset repository on the Hugging Face Hub.\n",
    "!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions Classification-Speech-Instructions-audio.zip --repo-type=dataset"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
